* prepare dataset to predict house prices in barcelona
********************************************************************************
* prelims
********************************************************************************
{
clear all
set more off
cap log close
}
********************************************************************************
* programs for common cleaning
********************************************************************************
{

** harmonize floors
cap program drop floor_har
program define floor_har 
	
	destring Floor, g(FloorH) force
	
	replace FloorH = 0 if regexm(Floor, "SM|PB|BJ|^B$|^TD$")
	replace FloorH = 0.5 if regexm(Floor, "EN|ENT|^E$|E1|E2")
	replace FloorH = 1 if regexm(Floor, "PR|PP")
	replace FloorH = NFloors if Floor == "AT"|Floor =="AL" /*AL='Altillo'*/
	forval i =1(1)9 {
	replace FloorH = `i' if Floor == "P`i'"|Floor=="A`i'"
	}
	forval i = 0(1)9 {
		replace FloorH = `i' if regexm(Floor, "`i'") 
	}
	replace FloorH = 0 if FloorH <= 0
	replace FloorH = 1 if FloorH == .
	
end	

* prediction variables
cap program drop pred_covariates
program define pred_covariates

	* add CSEC CDIS in year of transaction
merge m:1 PlotCode year using "$data/int/catastro_census_neigh_08_900.dta", keepusing(CSEC CMUN CPRO CDIS CUSEC BARRI Lon Lat)
keep if _merge == 3
drop _merge

* add plot characteristics
#d;
	local catvars ShrSqmAPLOT ShrSqmCPLOT ShrSqmGCPLOT ShrSqmGHPLOT 
	ShrSqmGRPLOT ShrSqmOPLOT ShrSqmVPLOT PropSqmPLOT_med PropSqmPLOT_sd
	MinQual MaxQual MedQual NProps
	MaxAptSize MinAptSize NApts NFloors
	ShrSqmCommonPLOT SqmCommonPLOT
	SqmPLOT SqmAPLOT SqmCPLOT SqmGCPLOT SqmGHPLOT SqmGRPLOT SqmOPLOT SqmVPLOT
	PropSqmPLOT_mean
	;
#d cr
	
merge m:1 PlotCode using "$data/int/catastro_plot_chars.dta", keepusing(`catvars')
drop if _merge == 2
drop _merge

* add new apartment buildings info
rename year Year

local vars NewAptBldings RefAptBldings
merge m:1 CPRO CMUN CDIS CSEC Year using "$data/int/catastro_blng_expan.dta", keepusing(`vars')
drop if _merge==2
drop _merge

replace NewAptBldings = 0 if NewAptBldings == .
replace RefAptBldings = 0 if RefAptBldings == .

* add catastro census tract characteristics 
#d;
	local vars ShrSqmACSEC ShrSqmCCSEC ShrSqmECSEC ShrSqmGCCSEC ShrSqmGHCSEC 
	ShrSqmGRCSEC ShrSqmICSEC 
	ShrSqmOCSEC ShrSqmPCSEC ShrSqmRCSEC ShrSqmVCSEC ShrSqmTCSEC ShrSqmYCSEC
	SqmCSEC PropSqmCSEC_mean PropSqmCSEC_sd  
	MeanYearConstCSEC MeanNFloorsCSEC MedQualCSEC
	;
#d cr
merge m:1 CPRO CMUN CDIS CSEC Year using "$data/int/catastro_csec_chars.dta", keepusing(`vars')
drop if _merge==2
drop _merge

foreach var in `vars' {
	bys CDIS Year: egen mean_`var' = mean(`var')
	replace `var' = mean_`var' if `var' == .
	drop mean_`var'

}

rename Year year

* max number of floors
g FloorMatchRes = FloorMatch
replace FloorMatchRes = 10 if FloorMatch >= 10
la var FloorMatchRes "Floor Match (Capped at 10th Floor)"

* construction age
g ConstAge = year - YearConst + 1
replace ConstAge = 1 if ConstAge <= 0
la var ConstAge "Age of the Construction"

* age
g ConstActAge = (year - PropYear + 1)
replace ConstActAge = ConstAge if ConstActAge > ConstAge & ConstActAge != .
replace ConstActAge = 1 if ConstActAge <= 0
la var ConstActAge "Construction Age (since reform)"

* reforms
g RecentRef = (PropYear > YearConst & PropYear != . & (ConstActAge <= 3))

* add INE registry data
#d;
	local regvars pop* esp_nationality foreign_nnt africa_nnt america_nnt asia_nnt 
	europe_nnt rest_nnt eu_nnt esp same_reg same_reg_pro_mun 
	dif_reg
	;
#d cr
g gender = "as"
merge m:1 CPRO CMUN CDIS CSEC gender year using "$data/int/reg_csec.dta", keepusing(`regvars')
drop if _merge==2
drop _merge

* add 2011 census info
rename CSEC t_CSEC
rename CDIS t_CDIS

rename year t_year
g year = 2011
merge m:1 PlotCode year using "$data/int/catastro_census_neigh_08_900.dta", keepusing(CSEC CDIS CMUN CPRO AreaSqm)

replace CSEC = t_CSEC if _merge==1 & t_CSEC!=""
replace CDIS = t_CDIS if _merge==1 & t_CDIS!=""

drop if _merge == 2
drop _merge year

* census vars to use
#d;
	global censusvars single married separated divorced illiterate no_educ 
	primary_educ secondary_educ tertiary_educ tot_house prim_house 
	sec_house vac_house h_own_paid h_own_pend h_rent h_own_inh h_cede h_30to45 
	h_46to60 h_61to75 h_76to90 h_91to105 h_106to120 h_121to150 h_151to180 
	h_more180 h_2res h_3res h_4res h_5res h_6res h_7res h_8res h_9moreres 
	hh_1per hh_2per hh_3per hh_4per hh_5per hh_6per
	;
#d cr

merge m:1 CPRO CMUN CDIS CSEC using "$data/int/census2011_csec.dta", keepusing($censusvars)
drop if _merge == 2
drop _merge

foreach var in $censusvars {
	
	rename `var' t_`var'
	
}

* assign district when missing
merge m:1 CPRO CMUN CDIS using "$data/int/census2011_cdis.dta", keepusing($censusvars)
drop if _merge == 2
drop _merge

foreach var in $censusvars {
	
	replace t_`var' = `var' if `var' !=. & t_`var' == .
	drop `var'
	rename t_`var' `var' 
	
}

* add TMB distance to metro stations
* distance from tract
merge m:1 CDIS CSEC using "$data/int/distance_trans.dta", keepusing(Distance1 Distance2 Distance3)
drop if _merge == 2
drop _merge
* assign mean distance in neighborhood if missing
merge m:m BARRI using "$data/int/distance_trans.dta", keepusing(MeanD*)
drop _merge
replace Distance1 = MeanD1 if Distance1==. & MeanD1!=.
replace Distance2 = MeanD2 if Distance2==. & MeanD2!=.
replace Distance3 = MeanD3 if Distance3==. & MeanD3!=.
drop MeanD*

* add sales data from ajuntament bcn
g year = 2019
merge m:1 BARRI year using "$data/int/ajbcn_houseprice_barri.dta", keepusing(pricesqm)
drop if _merge==2
drop _merge
rename pricesqm pricesqmBARRI2019

merge m:1 CDIS year using "$data/int/ajbcn_houseprice_cdis.dta", keepusing(pricesqm)
drop if _merge==2
drop _merge
rename pricesqm pricesqmCDIS

* add INE atlas renta data
g Year = 2017 
	
#d;
	global inevars MedianPersInc MedianHHInc IncSourceWage
	IncSourcePens IncSourceUI IncSourceOtherSub IncSourceOtherInc
	;
#d cr
	
merge m:1 CPRO CMUN CDIS CSEC Year using "$data/int/ine_atlas_renta_csec.dta", keepusing($inevars)
drop if _merge == 2
drop _merge

foreach var in $inevars {
	rename `var' t_`var'
}

merge m:1 CPRO CMUN CDIS Year using "$data/int/ine_atlas_renta_cdis.dta", keepusing($inevars)
drop if _merge == 2
drop _merge

foreach var in $inevars {
	
	replace t_`var' = `var' if `var' !=. & t_`var' == .
	drop `var'
	rename t_`var' `var' 
	
}

foreach var in $inevars {
	rename `var' t_`var'
}

merge m:1 CPRO CMUN Year using "$data/int/ine_atlas_renta_cmun.dta", keepusing($inevars)
drop if _merge == 2
drop _merge

foreach var in $inevars {
	replace t_`var' = `var' if `var' !=. & t_`var' == .
	drop `var'
	rename t_`var' `var' 
}

replace CSEC = t_CSEC
replace CDIS = t_CDIS
drop t_CSEC t_CDIS
drop Year

g lnMedianPersInc = ln(MedianPersInc)
g lnMedianHHInc = ln(MedianHHInc)
replace IncSourceWage = IncSourceWage / 100
replace IncSourcePens = IncSourcePens  / 100
replace IncSourceUI = IncSourceUI  / 100
replace IncSourceOtherSub = IncSourceOtherSub  / 100
replace IncSourceOtherInc = IncSourceOtherInc / 100

drop year
rename t_year Year

* drop vars
drop esp pop_85 gender House Apartment gender rest_nnt

* additional variables
g popagg_0019 = pop_0004 + pop_0509 + pop_1014 + pop_1519
g popagg_2039 = pop_2024 + pop_2529 + pop_3034 + pop_3539
g popagg_4064 = pop_4044 + pop_4549 + pop_5054 + pop_5569 + pop_6064
g popagg_65100 = pop_6569 + pop_7074 + pop_7579 + pop_8084 + pop_8589 + pop_9094 + pop_9599 + pop_100

drop pop_*

g PopDensity = pop / AreaSqm

* vars in logs
* ine
#d;
	local inevars MedianPersInc MedianHHInc 
	;
#d cr
* census
#d;
	local censusvars single married separated divorced illiterate no_educ 
	primary_educ secondary_educ tertiary_educ tot_house prim_house 
	sec_house vac_house h_own_paid h_own_pend h_rent h_own_inh h_cede h_30to45 
	h_46to60 h_61to75 h_76to90 h_91to105 h_106to120 h_121to150 h_151to180 
	h_more180 h_2res h_3res h_4res h_5res h_6res h_7res h_8res h_9moreres 
	hh_1per hh_2per hh_3per hh_4per hh_5per hh_6per
	pop esp_nationality foreign_nnt same_reg_pro_mun
	;
#d cr
* demographic vars
#d;
	local demvars popagg_0019 popagg_2039 popagg_4064 popagg_65100 europe_nnt
	africa_nnt america_nnt asia_nnt eu_nnt
	same_reg dif_reg
	;
#d cr

* catastro vars
#d;
	local catvars PropSqmPLOT_med PropSqmPLOT_sd NProps
	SqmCSEC PropSqmCSEC_mean PropSqmCSEC_sd  
	MeanYearConstCSEC MeanNFloorsCSEC ConstActAge PropSqm AreaSqm
	MaxAptSize MinAptSize NApts
	ShrSqmCommonPLOT SqmCommonPLOT
	SqmPLOT SqmAPLOT SqmCPLOT SqmGCPLOT SqmGHPLOT SqmGRPLOT SqmOPLOT SqmVPLOT
	PropSqmPLOT_mean
	;
#d cr
* stations
local locvars Distance1 Distance2 Distance3
* other
local other pricesqmBARRI2019 pricesqmCDIS

* generate vars
foreach var in `demvars' `locvars' `censusvars' `inevars' `rentalvars' `other' `catvars' {

	replace `var' = 0 if `var' == .
	cap g ln`var' = ln(`var' + 1)
	drop `var'

}

g lnConstAge =  ln(ConstAge)
g ConstAgeSq = ConstAge * ConstAge
g lnConstAgeSq =  ln(ConstAgeSq)
drop ConstAgeSq

* quality
foreach var in QualConst MinQual MaxQual MedQual MedQualCSEC {

	g `var'Scaled = (`var' * -1) + 10
	replace `var'Scaled = 5 if `var'Scaled == .
	drop `var'

}

drop CPRO CMUN
rename Year year

end 

}
********************************************************************************
* ATC data
********************************************************************************
{
* ATC Data
use "$data/int/ATC-ITP_2009_19.dta", clear

* only barcelona
keep if CPRO=="08" & CMUN=="019"
drop CensusYear CSEC CDIS
drop AEB ZUA LITERAL

* first sale
g FirstSale = (CODITARIFA == "AJ4" & (ANYMERITAC - ANYANTIGUI < 3))

* Floors
rename PARCELLA PlotCode
merge m:1 PlotCode using "$data/int/catastro_plot_chars.dta", keepusing(NFloors)
keep if _merge==3
drop _merge

* add info on apartment units
merge m:m PlotCode using "$data/int/08_900.dta", keepusing(Floor PropSqm House Apartment PropYear YearConst SqmA SqmV)
drop if _merge==2
drop _merge

drop if substr(Floor,1,1)=="-"

g x = (SqmV > 0)
bys OBJECTID PlotCode: egen totx = total(x)
drop if SqmV == 0 & totx > 0 
drop x totx

drop SqmA SqmV

** harmonize floor
gsort OBJECTID PlotCode DATAMERITA Floor PropSqm
duplicates drop OBJECTID PlotCode DATAMERITA Floor PropSqm, force

* program
floor_har
drop Floor
g MatchSqm = (PropSqm == SUPCONSTRU)

bys OBJECTID PlotCode: egen NMatch = sum(MatchSqm)

* if there is at least one match, get rid of unmatched floors
drop if NMatch>=1 & MatchSqm==0

* assign matches
g FloorMatch = FloorH if NMatch == 1
g SureFloorMatch = (NMatch == 1)

* generate distance
g SqmDiff = (PropSqm - SUPCONSTRU)

* fuzzy matching
foreach dist in 5 10 {

	g MatchSqmFuzzy = (SqmDiff>= - `dist' & SqmDiff <= `dist' & NMatch == 0)
	bys OBJECTID PlotCode: egen NMatchFuzzy = sum(MatchSqmFuzzy)
	replace FloorMatch = FloorH if NMatchFuzzy == 1 & FloorMatch == .
	drop MatchSqmFuzzy NMatchFuzzy

}

drop SqmDiff

* if floor still missing, assign average floor
bys OBJECTID PlotCode: egen MedFloor = median(FloorH)
replace FloorMatch = MedFloor if NMatch == 0 & FloorMatch == .
replace FloorMatch = round(FloorMatch) if FloorMatch!=0.5
drop MedFloor

* drop ATC duplicates
gsort OBJECTID PlotCode DATAMERITA -MatchSqm -SureFloorMatch
duplicates drop OBJECTID PlotCode DATAMERITA, force

drop MatchSqm NMatch SureFloorMatch

* year
rename ANYMERITAC year
la var year "transaction year"

* property square meters
replace PropSqm = SUPCONSTRU 
drop SUPCONSTRU

* price
rename VTOTALDECL price
g lnP = ln(price)

* quality
destring CATEGORIA, g(QualConst) force
replace QualConst = 1 if (CATEGORIA=="A"|CATEGORIA=="B"|CATEGORIA=="C")
drop CATEGORIA

* age
replace PropYear = ANYANTIGUI if ANYANTIGUI > PropYear & ANYANTIGUI<year & (ANYANTIGUI-PropYear>5)
replace PropYear = ANYANTIGUI if year < PropYear
replace YearConst = ANYANTIGUI if year < YearConst
replace YearConst = ANYANTIGUI if YearConst == . & ANYANTIGUI!=.
drop ANYANTIGUI

* UC Type
drop TIPOLOGIA

* corrections
replace Apartment = 1 if Apartment==0 & House==0
drop OBJECTID FloorH

* prediction covariates
pred_covariates

replace RecentRef = 0 if FirstSale==1

drop X_ATC Y_ATC DATAMERITA CMUN_DC MUNDISSEC CCA CLAUCADAST CODITARIFA

* store
compress
save "$data/int/ATC_predict.dta", replace

}
********************************************************************************
* Catastro data
********************************************************************************
{
	
clear

forval year = 2009(1)2019 {
	
	* add data
	append using "$data/int/catastro_prop_chars.dta", keep(PlotCode PropOrder* Floor *ZIP* House Apartment YearConst PropYear NFloors QualConst PropSqm ShrSqmAPROP ShrSqmVPROP)
	* keep houses or apartments
	keep if ((House==1|Apartment==1))
	* restrict data
	keep if (YearConst <= `year')
	cap g year = `year'
	replace year = `year' if year == .

}

* harmonize floor
floor_har
drop Floor
g FloorMatch = FloorH 
drop FloorH

* first sale
g FirstSale = (year - YearConst <= 3)

* prediction covariates
pred_covariates

* store
forval year = 2019(-1)2009 {
	
	preserve

	* keep only data for year
	keep if year == `year'
	compress
	save "$data/temp/catdata_predict_`year'.dta", replace

	restore
	
}
}
********************************************************************************
* closing
********************************************************************************
{
cap log close
clear
}
